mta_data = read_csv (file = "./data/MTA_recent_ridership_data_20201123_0.csv",
                     col_types = cols(
                       date = col_date(format = "%mm/%dd/%yy"),
                       `Subways: % Change From 2019 Equivalent Day` = col_number(),
                       `Buses: % Change From 2019 Equivalent Day` = col_number(),
                       `Bridges and Tunnels: % Change From 2019 Equivalent Day` = col_number()
                       ) #only changed the formats of important variables 
) %>%
  janitor::clean_names()
skimr::skim(mta_data)
Data summary
Name mta_data
Number of rows 268
Number of columns 13
_______________________
Column type frequency:
character 4
numeric 9
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
date 0 1.00 6 8 0 268 0
lirr_percent_change_from_2019_monthly_weekday_saturday_sunday_average 31 0.88 4 4 0 41 0
metro_north_percent_change_from_2019_monthly_weekday_saturday_sunday_average 31 0.88 4 4 0 39 0
access_a_ride_percent_change_from_2019_monthly_weekday_saturday_sunday_average 0 1.00 5 7 0 222 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
subways_total_estimated_ridership 0 1.00 1188754.06 907206.69 198693.0 590202.25 1082366.5 1606546.00 5515945.0 ▇▅▁▁▁
subways_percent_change_from_2019_equivalent_day 0 1.00 -74.53 17.80 -93.5 -87.40 -75.2 -69.90 23.9 ▇▂▁▁▁
buses_total_estimated_ridership 0 1.00 898386.33 367033.69 279100.0 591025.00 948100.0 1109492.00 2244500.0 ▇▇▇▁▁
buses_percent_change_from_2019_equivalent_day 0 1.00 -51.25 18.09 -84.0 -63.50 -52.0 -43.00 40.0 ▃▇▁▁▁
lirr_total_estimated_ridership 31 0.88 49484.81 29227.32 1900.0 22800.00 46800.0 76600.00 92500.0 ▆▅▅▅▇
metro_north_total_estimated_ridership 31 0.88 40194.51 19249.81 5100.0 21200.00 43600.0 58200.00 77300.0 ▇▅▇▇▅
access_a_ride_total_scheduled_trips 0 1.00 14138.28 6772.45 2506.0 8404.50 13073.0 19894.75 34304.0 ▇▇▇▂▁
bridges_and_tunnels_total_traffic 0 1.00 668395.02 189440.49 177590.0 541411.50 744720.0 815673.75 938167.0 ▁▃▃▆▇
bridges_and_tunnels_percent_change_from_2019_equivalent_day 0 1.00 -28.59 20.48 -80.1 -43.02 -20.4 -13.57 27.4 ▃▃▇▇▁
mta_data =
  mta_data %>%
  subset(select = -c(lirr_total_estimated_ridership, lirr_percent_change_from_2019_monthly_weekday_saturday_sunday_average, metro_north_total_estimated_ridership, metro_north_percent_change_from_2019_monthly_weekday_saturday_sunday_average, access_a_ride_total_scheduled_trips, access_a_ride_percent_change_from_2019_monthly_weekday_saturday_sunday_average, bridges_and_tunnels_total_traffic, bridges_and_tunnels_percent_change_from_2019_equivalent_day))
mta_data = 
  mta_data %>%
  mutate( 
    'subway_2019' = subways_total_estimated_ridership/(1+(subways_percent_change_from_2019_equivalent_day/100)),
    'bus_2019'=
      buses_total_estimated_ridership/(1+(buses_percent_change_from_2019_equivalent_day/100))
    ) %>%
  rename(
    "subway_2020" = subways_total_estimated_ridership,
    "subway_pct_change" = subways_percent_change_from_2019_equivalent_day,
    "bus_2020" = buses_total_estimated_ridership,
    "bus_pct_change" = buses_percent_change_from_2019_equivalent_day
    )
#change date to date format and order by date
plot_mta =
  mta_data %>%
  mutate(
    date= as.Date(date,format = "%m/%d")) %>%
  arrange(date)
#create a text_label label
text_label =
  plot_mta %>%
  mutate(text_label = str_c("percent change from 2019 to 2020: ", subway_pct_change)) %>%
  select(date, text_label)

#setting up graph for subway
#pivoting
plot_subway = 
  plot_mta %>%
  select(subway_2020, subway_2019, date) %>%
  melt(., id.vars = "date") %>%
#merge based on month 
  merge(text_label, by = "date")
#setting up graph for bus
#pivoting
plot_bus = 
  plot_mta %>%
  select(bus_2020,bus_2019,date) %>%
  melt(., id.vars = "date") %>% 
#merge based on month 
  merge(text_label, by = "date")
#setting up for t-test
#separate by month and day
mta_data = 
  mta_data %>%
  arrange(date) %>%
  separate(date, into = c("month", "day", "year"))%>%
  mutate(month = as.numeric(month),
         day = as.numeric(day)) %>%
  select(-c(year)) #drop year column

mta_subway_ridership =
  mta_data %>%
  group_by(month)%>%
  summarize(
    avg_subway_2019 = mean(subway_2019),
    avg_subway_2020 = mean(subway_2020)
  )

mta_2019_sample = 
  mta_data%>%
  select(month, subway_2019) %>%
  nest(subway_2019)%>%
  mutate("subway_2019_sample" = data)%>%
  select(-data)

mta_2020_sample = 
  mta_data%>%
  select(month, subway_2020)%>%
  nest(subway_2020)%>%
  mutate("subway_2020_sample" = data)%>%
  select(-data)

mta_samples = 
  bind_cols(mta_2019_sample, mta_2020_sample)%>%
  select(-month...3)%>%
  rename(month = month...1)
mta_t_test = 
  mta_samples%>%
  mutate(t_test = map2(.x = subway_2019_sample, .y = subway_2020_sample, ~t.test(.x , .y) ),
         t_test_results = map(t_test, broom::tidy))%>%
  select(month, t_test_results)%>%
  unnest(t_test_results)%>%
  select(month,p.value)%>%
  mutate(difference = case_when(
    p.value >= 0.05 ~ "insignificant",
    p.value < 0.05 ~ "significant"),
    p.value = ifelse(
    p.value< 0.001,"<0.001",round(p.value, digits = 4))) %>%
  arrange(month) 

mta_year_ttest = 
  bind_cols(mta_subway_ridership, mta_t_test)%>%
  select(-month...4)%>%
  rename(month = month...1) 
#create a text_label label
text_label =
  mta_year_ttest %>%
  mutate(text_label = str_c("p-value: ",p.value, "\nDifference: ", difference)) %>%
  select(month, text_label)
#pivoting
plot_ttest = 
  mta_subway_ridership %>%
  rename(
    "2020"=avg_subway_2020,
    "2019"=avg_subway_2019
  ) %>%
  melt(., id.vars = "month") %>%
#merge based on month 
  merge(text_label, by = "month")
subway_ridership = lm(subway_2020 ~ month, data = mta_data)
subway_ridership %>% 
  broom::tidy() %>% 
  select(term, estimate, p.value) %>% 
  knitr::kable(digits = 3)
term estimate p.value
(Intercept) 965079.87 0.000
month 32472.74 0.139
#to plot for regression line 
  ggplot(mta_data, aes(month, subway_2020)) +
  geom_point() +
  stat_smooth(method = lm)

mta_data %>% 
  modelr::add_predictions(subway_ridership) %>% 
  modelr::add_residuals(subway_ridership) %>% 
  ggplot(aes(x = pred, y = resid)) + geom_point() + 
  labs(x = "Predicted value", 
       y = "Residual")

Next, we need to propose a regression model for bus ridership in 2020. As the outcome (bus ridership) is continuous, we can fit a linear regression model.

bus_ridership = lm(bus_2020 ~ month, data = mta_data)
bus_ridership %>% 
  broom::tidy() 
# A tibble: 2 x 5
  term        estimate std.error statistic  p.value
  <chr>          <dbl>     <dbl>     <dbl>    <dbl>
1 (Intercept)  717060.    64077.     11.2  4.46e-24
2 month         26325.     8733.      3.01 2.82e- 3
#Now we need to tidy the output and get only the intercept, slope and p-values 
bus_ridership %>% 
  broom::tidy() %>% 
  select(term, estimate, p.value) %>% 
  knitr::kable(digits = 3)
term estimate p.value
(Intercept) 717060.27 0.000
month 26324.69 0.003
#to plot for regression line
ggplot(mta_data, aes(month, bus_2020)) +
  geom_point() +
  stat_smooth(method = lm)



Column {data-width=650}
-----------------------------------------------------------------------

### Chart A

<div class="knitr-options" data-fig-width="768" data-fig-height="576"></div>


```r
plot_ttest %>%
plot_ly(
    x = ~month, y = ~value, type = "scatter", mode = "lines+markers",
    color = ~variable, text = ~text_label) %>%
  layout (
    title = "Monthly Average Ridership of Subway 2019 vs 2020",
    xaxis = list(title ="Months",range=c(3,11)),
    yaxis = list(title="Average Ridership"),
    legend = list(font = list(size = 10))
    )

Column

Chart B

plot_subway %>%
  plot_ly(
    x = ~date, y = ~value, type = "scatter", mode = "markers",
    color = ~variable, text = ~text_label) %>%
  layout (
    title = "Subway Ridership Trends 2019 - 2020",
    xaxis = list(title ="Month/Day", tickformat = "%m/%d"), #drop year
    yaxis = list(title="Ridership"))  %>%
  add_lines(x =as.Date("2020-03-01"), line = list(dash="dot", color = 'red', width=0.5, opacity = 0.5),name = 'First case on 3/1') %>%
  add_lines(x =as.Date("2020-04-07"), line = list(dash="dot", color = 'red', width=0.5, alpha = 0.5),name = '100K cases in NYC on 04/07') %>%
  add_lines(x =as.Date("2020-05-26"), line = list(dash="dot", color = 'red', width=0.5, alpha = 0.5),name = '200K cases in NYC on 05/26')

Chart C

#plot_bus
plot_bus  %>%
  plot_ly(
    x = ~date, y = ~value, type = "scatter", mode = "markers",
    color = ~variable, text = ~text_label) %>%
  layout (
    title = "Bus Ridership Trends 2019 - 2020",
    xaxis = list(title ="Month/Day", tickformat = "%m/%d"),
    yaxis = list(title="Ridership")) %>%
  add_lines(x =as.Date("2020-03-01"), line = list(dash="dot", color = 'red', width=0.5, opacity = 0.5),name = 'First case on 3/1') %>%
  add_lines(x =as.Date("2020-04-07"), line = list(dash="dot", color = 'red', width=0.5, alpha = 0.5),name = '100K cases in NYC on 04/07') %>%
  add_lines(x =as.Date("2020-05-26"), line = list(dash="dot", color = 'red', width=0.5, alpha = 0.5),name = '200K cases in NYC on 05/26')
---
title: "MTA ridership 2019 - 20 Dashboard"
output: 
  flexdashboard::flex_dashboard:
    orientation: columns
    vertical_layout: fill
    source: embed
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(patchwork)
library(readr)
library(broom)
library(dbplyr)
library(viridis)
library(reshape2)
library(plotly)
library(lubridate)

knitr::opts_chunk$set(
	echo = TRUE,
	warning = FALSE,
	fig.width = 8, 
  fig.height = 6,
  out.width = "90%"
)

options(
  ggplot2.continuous.colour = "viridis",
  ggplot2.continuous.fill = "viridis"
)

scale_colour_discrete = scale_colour_viridis_d
scale_fill_discrete = scale_fill_viridis_d

theme_set(theme_minimal() + theme(legend.position = "bottom"))
```

``` {r importing data}
mta_data = read_csv (file = "./data/MTA_recent_ridership_data_20201123_0.csv",
                     col_types = cols(
                       date = col_date(format = "%mm/%dd/%yy"),
                       `Subways: % Change From 2019 Equivalent Day` = col_number(),
                       `Buses: % Change From 2019 Equivalent Day` = col_number(),
                       `Bridges and Tunnels: % Change From 2019 Equivalent Day` = col_number()
                       ) #only changed the formats of important variables 
) %>%
  janitor::clean_names()
skimr::skim(mta_data)
mta_data =
  mta_data %>%
  subset(select = -c(lirr_total_estimated_ridership, lirr_percent_change_from_2019_monthly_weekday_saturday_sunday_average, metro_north_total_estimated_ridership, metro_north_percent_change_from_2019_monthly_weekday_saturday_sunday_average, access_a_ride_total_scheduled_trips, access_a_ride_percent_change_from_2019_monthly_weekday_saturday_sunday_average, bridges_and_tunnels_total_traffic, bridges_and_tunnels_percent_change_from_2019_equivalent_day))
```

``` {r calculate 2019 ridership bus subway}
mta_data = 
  mta_data %>%
  mutate( 
    'subway_2019' = subways_total_estimated_ridership/(1+(subways_percent_change_from_2019_equivalent_day/100)),
    'bus_2019'=
      buses_total_estimated_ridership/(1+(buses_percent_change_from_2019_equivalent_day/100))
    ) %>%
  rename(
    "subway_2020" = subways_total_estimated_ridership,
    "subway_pct_change" = subways_percent_change_from_2019_equivalent_day,
    "bus_2020" = buses_total_estimated_ridership,
    "bus_pct_change" = buses_percent_change_from_2019_equivalent_day
    )
```

```{r plotting general trends of MTA ridership 2019 & 2020}
#change date to date format and order by date
plot_mta =
  mta_data %>%
  mutate(
    date= as.Date(date,format = "%m/%d")) %>%
  arrange(date)
#create a text_label label
text_label =
  plot_mta %>%
  mutate(text_label = str_c("percent change from 2019 to 2020: ", subway_pct_change)) %>%
  select(date, text_label)

#setting up graph for subway
#pivoting
plot_subway = 
  plot_mta %>%
  select(subway_2020, subway_2019, date) %>%
  melt(., id.vars = "date") %>%
#merge based on month 
  merge(text_label, by = "date")
#setting up graph for bus
#pivoting
plot_bus = 
  plot_mta %>%
  select(bus_2020,bus_2019,date) %>%
  melt(., id.vars = "date") %>% 
#merge based on month 
  merge(text_label, by = "date")
```

```{r}
#setting up for t-test
#separate by month and day
mta_data = 
  mta_data %>%
  arrange(date) %>%
  separate(date, into = c("month", "day", "year"))%>%
  mutate(month = as.numeric(month),
         day = as.numeric(day)) %>%
  select(-c(year)) #drop year column

mta_subway_ridership =
  mta_data %>%
  group_by(month)%>%
  summarize(
    avg_subway_2019 = mean(subway_2019),
    avg_subway_2020 = mean(subway_2020)
  )

mta_2019_sample = 
  mta_data%>%
  select(month, subway_2019) %>%
  nest(subway_2019)%>%
  mutate("subway_2019_sample" = data)%>%
  select(-data)

mta_2020_sample = 
  mta_data%>%
  select(month, subway_2020)%>%
  nest(subway_2020)%>%
  mutate("subway_2020_sample" = data)%>%
  select(-data)

mta_samples = 
  bind_cols(mta_2019_sample, mta_2020_sample)%>%
  select(-month...3)%>%
  rename(month = month...1)
```

```{r t test}
mta_t_test = 
  mta_samples%>%
  mutate(t_test = map2(.x = subway_2019_sample, .y = subway_2020_sample, ~t.test(.x , .y) ),
         t_test_results = map(t_test, broom::tidy))%>%
  select(month, t_test_results)%>%
  unnest(t_test_results)%>%
  select(month,p.value)%>%
  mutate(difference = case_when(
    p.value >= 0.05 ~ "insignificant",
    p.value < 0.05 ~ "significant"),
    p.value = ifelse(
    p.value< 0.001,"<0.001",round(p.value, digits = 4))) %>%
  arrange(month) 

mta_year_ttest = 
  bind_cols(mta_subway_ridership, mta_t_test)%>%
  select(-month...4)%>%
  rename(month = month...1) 
```

``` {r plotting set up}
#create a text_label label
text_label =
  mta_year_ttest %>%
  mutate(text_label = str_c("p-value: ",p.value, "\nDifference: ", difference)) %>%
  select(month, text_label)
#pivoting
plot_ttest = 
  mta_subway_ridership %>%
  rename(
    "2020"=avg_subway_2020,
    "2019"=avg_subway_2019
  ) %>%
  melt(., id.vars = "month") %>%
#merge based on month 
  merge(text_label, by = "month")
```
```{r regression models for subway 2020}
subway_ridership = lm(subway_2020 ~ month, data = mta_data)
subway_ridership %>% 
  broom::tidy() %>% 
  select(term, estimate, p.value) %>% 
  knitr::kable(digits = 3)

#to plot for regression line 
  ggplot(mta_data, aes(month, subway_2020)) +
  geom_point() +
  stat_smooth(method = lm)
```

* Show a plot of model residuals against fitted values – use add_predictions and add_residuals in making this plot.

```{r}
mta_data %>% 
  modelr::add_predictions(subway_ridership) %>% 
  modelr::add_residuals(subway_ridership) %>% 
  ggplot(aes(x = pred, y = resid)) + geom_point() + 
  labs(x = "Predicted value", 
       y = "Residual")
```

Next, we need to propose a regression model for bus ridership in 2020.
As the outcome (bus ridership) is continuous, we can fit a linear regression model. 

```{r}
bus_ridership = lm(bus_2020 ~ month, data = mta_data)
bus_ridership %>% 
  broom::tidy() 

#Now we need to tidy the output and get only the intercept, slope and p-values 
bus_ridership %>% 
  broom::tidy() %>% 
  select(term, estimate, p.value) %>% 
  knitr::kable(digits = 3)

#to plot for regression line
ggplot(mta_data, aes(month, bus_2020)) +
  geom_point() +
  stat_smooth(method = lm)
```
```


Column {data-width=650}
-----------------------------------------------------------------------

### Chart A

```{r}
plot_ttest %>%
plot_ly(
    x = ~month, y = ~value, type = "scatter", mode = "lines+markers",
    color = ~variable, text = ~text_label) %>%
  layout (
    title = "Monthly Average Ridership of Subway 2019 vs 2020",
    xaxis = list(title ="Months",range=c(3,11)),
    yaxis = list(title="Average Ridership"),
    legend = list(font = list(size = 10))
    )
```


Column {data-width=350}
-----------------------------------------------------------------------

### Chart B

```{r}
plot_subway %>%
  plot_ly(
    x = ~date, y = ~value, type = "scatter", mode = "markers",
    color = ~variable, text = ~text_label) %>%
  layout (
    title = "Subway Ridership Trends 2019 - 2020",
    xaxis = list(title ="Month/Day", tickformat = "%m/%d"), #drop year
    yaxis = list(title="Ridership"))  %>%
  add_lines(x =as.Date("2020-03-01"), line = list(dash="dot", color = 'red', width=0.5, opacity = 0.5),name = 'First case on 3/1') %>%
  add_lines(x =as.Date("2020-04-07"), line = list(dash="dot", color = 'red', width=0.5, alpha = 0.5),name = '100K cases in NYC on 04/07') %>%
  add_lines(x =as.Date("2020-05-26"), line = list(dash="dot", color = 'red', width=0.5, alpha = 0.5),name = '200K cases in NYC on 05/26')
```

### Chart C

```{r}
#plot_bus
plot_bus  %>%
  plot_ly(
    x = ~date, y = ~value, type = "scatter", mode = "markers",
    color = ~variable, text = ~text_label) %>%
  layout (
    title = "Bus Ridership Trends 2019 - 2020",
    xaxis = list(title ="Month/Day", tickformat = "%m/%d"),
    yaxis = list(title="Ridership")) %>%
  add_lines(x =as.Date("2020-03-01"), line = list(dash="dot", color = 'red', width=0.5, opacity = 0.5),name = 'First case on 3/1') %>%
  add_lines(x =as.Date("2020-04-07"), line = list(dash="dot", color = 'red', width=0.5, alpha = 0.5),name = '100K cases in NYC on 04/07') %>%
  add_lines(x =as.Date("2020-05-26"), line = list(dash="dot", color = 'red', width=0.5, alpha = 0.5),name = '200K cases in NYC on 05/26')
```